Here we show the code to reproduce the analyses of: Risso and Pagnotta (2020). Per-sample standardization and asymmetric winsorization lead to accurate classification of RNA-seq expression profiles. In preparation.

This file belongs to the repository: https://github.com/drisso/awst_analysis.

The code is released with license GPL v3.0.

Install and load awst

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drisso/awst")
rm(list = ls())
library(readr)
library(awst)
library(EDASeq)
library(Rtsne)
library(umap)
library(dendextend)
setwd("~/Dropbox/AWST/gitHub/cbmc20200131") # 20200318
jobName <- "CBMC20200318"

Data import and cleaning

The data are available on GEO with the following accession number.

In the following chunks of code, we arrange a data-frame where each of the cell are annotated, and the annotations are coded as colors.

We mostly follow the original article for data preprocessing.

Note that here we download and read the data directly from the remote gzipped files available in GEO. In some cases, with unstable internet connections, the following chunks may not work. In such cases, we suggest to “manually” downaload the data using the links provided above and read the data in R from a local copy of the file.

ADT and annotation

if(!file.exists("annotation.RData")) {
  positive.col <-  "red"
  negative.col <- "gray"
  ttable <- read_csv("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE100866&format=file&file=GSE100866_PBMC_vs_flow_10X-ADT_umi.csv.gz")
  ttable <- as.data.frame(ttable)
  rownames(ttable) <- ttable[,1]
  ttable <- ttable[,-1]
  ttable <- 1 + t(ttable)
  tmp <- exp(log(apply(ttable, 1, prod))/ncol(ttable))
  ttable <- log(ttable/tmp)
  ttable <- 6 * (pnorm(ttable) - 0.5)
  annotation.df <- as.data.frame(ttable)
  annotation.df$cell <- rownames(annotation.df)
  nBins <- 9
  bbreaks <- seq(-3, 3, length.out = nBins + 1)
  levels_cols <- c("green4", "green3", "green2", "green", "gray75", "red", "red2", "red3", "red4")
  j <- 0
  while(j < ncol(ttable)) {
    j <- j + 1
    annotation.df$tmp <- cut(ttable[, j], breaks = bbreaks, include.lowest = TRUE)
    levels(annotation.df$tmp) <- levels_cols
    colnames(annotation.df)[ncol(annotation.df)] <- paste0(colnames(ttable)[j], ".col")
  }
  save(annotation.df, file = "annotation.RData")
} else load("annotation.RData")

Single-cell RNA-seq

Before applying normalization and AWST, we filtered out cells that did not met the following criteria:

  • remove the genes detected in no cells.

  • filter out cells not having at least 500 observed features.

if(!file.exists("Level3.RData")) {
ddata <- read_csv("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE100866&format=file&file=GSE100866_PBMC_vs_flow_10X-RNA_umi.csv.gz")

#restrict the features to the HUMAN ones
ddata <- ddata[grep("^HUMAN_", ddata$X1),]
ddata <- as.data.frame(ddata)
rownames(ddata) <- gsub("HUMAN_", "", ddata$X1)
ddata <- as.matrix(ddata[,-1])
save(ddata, file = "Level3.RData")
} else if(!file.exists("normCounts.RData")) load("Level3.RData")

Apply AWST

if(!file.exists("normCounts.RData")) {
  dim(ddata <- t(ddata)) # 7985 17014
  ###
  no_of_detected_gene_per_sample <- rowSums(ddata > 0) 
  fivenum(no_of_detected_gene_per_sample)#  10      638      739      873         4833 
  # restrict the collection of cells to those cells having at least 500 observed features
  sum(no_of_detected_gene_per_sample > 500) # 7613
  dim(ddata <- ddata[no_of_detected_gene_per_sample > 500,])#[1]  7613 17014
  # apply the full quantile normalization 
  normCounts <- EDASeq::betweenLaneNormalization(t(ddata), which = "full", round = FALSE)
  save(normCounts, file = "normCounts.RData")
} else if(!file.exists("expression.RData")) load("normCounts.RData")
  
# apply the AWS-transformation 
if(!file.exists("expression.RData")) {
  exprData <- awst(normCounts, poscount = TRUE, full_quantile = TRUE)
  dim(exprData <- gene_filter(exprData)) #[1] 7613  230
  save(exprData, file = "expression.RData")
} else load("expression.RData")

Clustering and dimensionality reduction

Note that due to changes to the pseudo-random number generator in R 3.6, the behavior of set.seed() has changed. Hence, the t-SNE and UMAP plots below are not exact copies of the ones in the paper, obtained with an older version of R. However, the main features of the datasets are preserved.

if(!file.exists("expression_dist_hclust.RData")) {
  nrow_exprData <- nrow(exprData)
  ncol_exprData <- ncol(exprData)
  ddist <- dist(exprData)
#  save(ddist, nrow_exprData, ncol_exprData, file = "expression_dist.RData")
  hhc <- hclust(ddist, method = "ward.D2")
  save(hhc, nrow_exprData, ncol_exprData, file = "expression_dist_hclust.RData")
} else load("expression_dist_hclust.RData")

if(!file.exists("expression_prcomp.RData")) {
  pprcomp <- prcomp(exprData)
  pprcomp$x <- pprcomp$x[, 1:10]
  pprcomp$rotation <- pprcomp$rotation[, 1:10]
  save(pprcomp, file = "expression_prcomp.RData")
} else load("expression_prcomp.RData")

if(!file.exists("expression_Rtsne_2d.RData")) {
  set.seed(2019) # needed to get the figure in the paper
  ans_Rtsne <- Rtsne(exprData, pca = FALSE) # Run TSNE
  save(ans_Rtsne, file = "expression_Rtsne_2d.RData")
} else load("expression_Rtsne_2d.RData")

if(!file.exists("expression_umap_2d.RData")) {
  set.seed(2019) # needed to get the figure in the paper
  ans_umap <- umap(exprData)
  save(ans_umap, file = "expression_umap_2d.RData")
} else load("expression_umap_2d.RData")
load("annotation.RData")
load("expression_dist_hclust.RData")
annotation.df <- annotation.df[hhc$labels,]
###############
save_plots <- FALSE
png_width_large <- 2100
png_height_large <- 500
png_width_small <- width_png <- 700
png_height_small <- 700
png_res <- 1/300
###################
color.bar2 <- function(x_pos, y_pos, lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='', values = NULL) {
    scale = (max-min)/length(lut)*0.3
    for (i in 1:length(lut)) {
      y_low  <- (i-1)*scale + min + y_pos
        y_high <-  y_low + scale
        rect(x_pos,y_low,x_pos+.05,y_high, col=lut[i], border=NA)
        text(x_pos+.05, (y_low + y_high)/2, values[i], adj = -0.1)
    }   
}
vvalues <- c("-3.0", "-2.0", "-1.3", "-0.6", " 0.0", " 0.6", " 1.3", " 2.0", " 3.0")
ffill2 <- names(table(annotation.df$CD3.col))

Main Clustering

clustering.prefix <- "CBMC"; short.prefix <- "CBMC"
clustering.df <- data.frame(cell = annotation.df$cell)
rownames(clustering.df) <- clustering.df$cell
#clustering.df <- clustering.df[hhc$labels,]
############
#load(paste0(jobName, "_dist_hclust.RData"))
mmain  <-  paste0("CBMC study (", nrow_exprData, " cells/", ncol_exprData, " genes)")
if(save_plots) {
  mmain <- ""
  png(paste0(jobName, "_expression_dist_hclust.png"), 
      width= png_width_large, height= png_height_large, res = png_res)
}
hhc$height <- hhc$height/max(hhc$height)
#mmain = paste0(jobName, " (", nrow_exprData, " cells/", ncol_exprData, " genes)")
plot(hhc, hang = -1, labels = FALSE, xlab = "", sub = "", main = mmain)
###
wwhere <- 6
hh <- mean(c(hhc$height[length(hhc$height)-wwhere+2], hhc$height[length(hhc$height)-wwhere+1]))
abline(h = hh, col = "red")
tmp <- tmp_ <- as.factor(cutree(hhc, k = wwhere))
#table(tmp)
#levels(tmp) <- c( "1", "1", "3", "4", "5", "6", "6", "8","4", "5")
wwhere <- length(unique(levels(tmp)))
clusteringWhere <- paste0(clustering.prefix, wwhere)
clusteringWhere.col <- paste0(clusteringWhere, ".col")
assign(clusteringWhere.col, tmp)
levels(tmp) <- paste0(short.prefix, wwhere, 1:wwhere)
assign(clusteringWhere, tmp)
levels(tmp) <- palette()[1:wwhere]
#table(tmp_, tmp)
assign(clusteringWhere.col, tmp)
tt <- table(get(clusteringWhere), get(clusteringWhere.col))
colorCode <- colnames(tt)[apply(tt, 1, which.max)]
names(colorCode) <- rownames(tt)
assign(paste0(clusteringWhere, ".colorCode"), colorCode)
clust.colorCode <- colorCode
clustering.df$tmp <- get(clusteringWhere)
clustering.df$tmp.explanatory <- clustering.df$tmp
clustering.df$tmp.col <- get(clusteringWhere.col)
ncol_ <- ncol(clustering.df)
colnames(clustering.df)[(ncol_-2):ncol_] <- c(clusteringWhere, paste0(clusteringWhere, ".explanatory"), clusteringWhere.col)
levels(clustering.df[, paste0(clusteringWhere, ".explanatory")]) <- c("CBMC1 - T Cell", "CBMC2 - B Cell", "CBMC3 - unclear", "CBMC4 - Natutal Killer", "CBMC5 - Monocyte", "CBMC6 - myeloid DC")


annotation.col <- annotation.df[, grep(".col", colnames(annotation.df))]
colnames(annotation.col) <- gsub(".col", "", colnames(annotation.col))
annotation.col <- annotation.col[, rev(c( "CD19", "CD3",  "CD11c", "CD14","CD4","CD8", "CD2","CD57"))]
annotation.col$CBMC <- clustering.df[, ncol(clustering.df)]

colored_bars(colors = annotation.col, dend = as.dendrogram(hhc), y_scale = 0.29, y_shift = 0.0045)
save(clustering.df, clust.colorCode, file = "clustering.RData")

tt <- table(clustering.df[, ncol(clustering.df)-1])
pct <- paste0(round(100*tt/sum(tt), 1), "%")
llegend <- paste(names(tt), " (", tt, "; ", pct, ")", sep = "")
tt <- table(clustering.df[, ncol(clustering.df)-1], clustering.df[, ncol(clustering.df)])
ffill <- colnames(tt)[apply(tt, 1, which.max)]
legend(1, .85, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)

color.bar3 <- function(x_pos, y_pos, lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='', values = NULL) {
    scale = (max-min)/length(lut)*0.3
#    plot(c(0,10), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab='', main=title, add = TRUE)
    #axis(2, ticks, las=1)
    for (i in 1:length(lut)) {
      y_low  <- (i-1)*scale + y_pos
        y_high <-  y_low + scale
        rect(x_pos,y_low,x_pos+90,y_high, col=lut[i], border=NA)
        text(x_pos+.05, (y_low + y_high)/2, values[i], adj = -2)
    }   
}
color.bar3(7230, 0.65, ffill2, -0.6, values = vvalues)
text(7275, 0.63, "Marker's level")
text(1, 1, "(a)")

####
#fName <- paste0(jobName, "_metadata.tsv")
#write.table(annotation.col, file = fName, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

PCA (AWST) | …

x_legend <- 0.08; y_legend <- -1.75
x_text <- -2.3; y_text <- 2.
load("expression_prcomp.RData")
pprcomp$x <- scale(pprcomp$x)
#
fName <- paste0(jobName, "_expression_prcomp.tsv")
write.table(pprcomp$x[, 1:3], file = fName, sep = "\t", row.names = FALSE, col.names = FALSE)
#
mmain = paste0("Principal components analysis (", nrow_exprData, " cells/", ncol_exprData, " features)")
cat(sprintf("\n\n### %s\n\n", mmain))

Principal components analysis (7613 cells/230 features)

if(save_plots) png(paste0(jobName, "_expression_prcomp_CITE6.png"), width= png_width_small, height= png_height_small, res = png_res)
plot(pprcomp$x, col = clustering.df$CBMC6.col, main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
legend(x_legend, y_legend, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(x_text, y_text, "(b)")

###
mmain <- "Principal component analysis - CD3"#  "prcomp (AWST)| flow cytometry/CD3"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD3

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD3.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD3.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
ffill2 <- names(table(annotation.df$CD3.col))
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(a)")

#
mmain <- "Principal component analysis - CD19"#  "prcomp (AWST)| flow cytometry/CD19"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD19

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD19.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD19.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(b)")

#
mmain <-  "Principal component analysis - CD11c"#  "prcomp (AWST)| flow cytometry/CD11c"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD11c

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD11c.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD11c.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(c)")

#
mmain <-  "Principal component analysis - CD14"#  "prcomp (AWST)| flow cytometry/CD14"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD14

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD14.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD14.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(d)")

######### CD4
mmain <- "Principal component analysis - CD4"#  "prcomp (AWST)| flow cytometry/CD4"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD4

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD4.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD4.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(e)")

######### CD8
mmain <- "Principal component analysis - CD8"#  "prcomp (AWST)| flow cytometry/CD8"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD8

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD8.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD8.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(f)")

######### CD2
mmain <- "Principal component analysis - CD2"#  "prcomp (AWST)| flow cytometry/CD8"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD2

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD2.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD2.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(g)")

######### CD57
mmain <- "Principal component analysis - CD57"#  "prcomp (AWST)| flow cytometry/CD8"
cat(sprintf("\n\n### %s\n\n", mmain))

Principal component analysis - CD57

if(save_plots) png(paste0(jobName, "_expression_prcomp_CD57.png"), width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD57.col), main = mmain, 
     xlab = "first principal component", ylab = "second principal component", pch = 19)
color.bar2(-2.3, 0.3, ffill2, -3, values = vvalues)
text(x_text, y_text, "(h)")

Rtsne (AWST) | clustering

load("expression_Rtsne_2d.RData")
ans_Rtsne$Y <- scale(ans_Rtsne$Y)
if(save_plots) png(paste0(jobName, "_expression_Rtsne.png"), width= png_width_small, height= png_height_small, res = png_res)
mmain = "T-distributed Stochastic Neighbor Embedding (tsne)"
plot(-ans_Rtsne$Y[, 1], ans_Rtsne$Y[, 2], col = clustering.df$CBMC6.col, main = mmain, xlab = "", ylab = "")
legend(-2.5, -1.1, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(-2.5, 1.8, "(c)")

umap (AWST) | clustering

load("expression_umap_2d.RData")
ans_umap$layout <- scale(ans_umap$layout)
if(save_plots) png(paste0(jobName, "_expression_umap.png"), width= png_width_small, height= png_height_small, res = png_res)
mmain = "Uniform Manifold Approximation and Projection (umap)"
plot(-ans_umap$layout[, 1], -ans_umap$layout[, 2], col = clustering.df$CBMC6.col, main = mmain, xlab = "", ylab = "")
legend(1, -1.5, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(-1.5, 1, "(d)")

Session info

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
## 
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=it_IT.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=it_IT.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=it_IT.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dendextend_1.13.4          umap_0.2.5.0              
##  [3] Rtsne_0.15                 EDASeq_2.10.0             
##  [5] ShortRead_1.34.2           GenomicAlignments_1.12.2  
##  [7] SummarizedExperiment_1.6.5 DelayedArray_0.2.7        
##  [9] matrixStats_0.56.0         Rsamtools_1.28.0          
## [11] GenomicRanges_1.28.6       GenomeInfoDb_1.12.3       
## [13] Biostrings_2.44.2          XVector_0.16.0            
## [15] IRanges_2.10.5             S4Vectors_0.14.7          
## [17] BiocParallel_1.10.1        Biobase_2.36.2            
## [19] BiocGenerics_0.22.1        awst_0.0.4                
## [21] readr_1.3.1               
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1           viridisLite_0.3.0       jsonlite_1.6.1         
##  [4] bit64_0.9-7             splines_3.4.4           R.utils_2.9.2          
##  [7] assertthat_0.2.1        aroma.light_3.6.0       latticeExtra_0.6-28    
## [10] blob_1.2.1              GenomeInfoDbData_0.99.0 yaml_2.2.1             
## [13] pillar_1.4.3            RSQLite_2.2.0           lattice_0.20-35        
## [16] glue_1.3.2              reticulate_1.14         digest_0.6.25          
## [19] RColorBrewer_1.1-2      colorspace_1.4-1        htmltools_0.4.0        
## [22] Matrix_1.2-18           R.oo_1.23.0             XML_3.99-0.3           
## [25] pkgconfig_2.0.3         biomaRt_2.32.1          genefilter_1.58.1      
## [28] zlibbioc_1.22.0         purrr_0.3.3             xtable_1.8-4           
## [31] scales_1.1.0            RSpectra_0.16-0         tibble_2.1.3           
## [34] openssl_0.9.9           annotate_1.54.0         ggplot2_3.3.0          
## [37] GenomicFeatures_1.28.5  survival_2.41-3         magrittr_1.5           
## [40] crayon_1.3.4            memoise_1.1.0           evaluate_0.14          
## [43] R.methodsS3_1.8.0       hwriter_1.3.2           tools_3.4.4            
## [46] hms_0.5.3               lifecycle_0.2.0         stringr_1.4.0          
## [49] munsell_0.5.0           AnnotationDbi_1.38.2    compiler_3.4.4         
## [52] DESeq_1.28.0            rlang_0.4.5             grid_3.4.4             
## [55] RCurl_1.98-1.1          bitops_1.0-6            rmarkdown_2.1          
## [58] gtable_0.3.0            DBI_1.1.0               R6_2.4.1               
## [61] gridExtra_2.3           dplyr_0.8.5             knitr_1.28             
## [64] rtracklayer_1.36.6      bit_1.1-15.2            stringi_1.4.6          
## [67] Rcpp_1.0.4              vctrs_0.2.4             geneplotter_1.54.0     
## [70] tidyselect_1.0.0        xfun_0.12